import pandas as pd
# Read the customer CSV into a DataFrame, then display describe()'s
# summary table (count, mean, std, min, quartiles, max).
df = pd.read_csv(filepath_or_buffer='creditCustomers.csv')
df.describe()
| CLIENTNUM | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 7.391776e+08 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 | 0.159997 | 0.840003 |
| std | 3.690378e+07 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 | 0.365301 | 0.365301 |
| min | 7.080821e+08 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000008 | 0.000420 |
| 25% | 7.130368e+08 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 | 0.000099 | 0.999660 |
| 50% | 7.179264e+08 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 | 0.000181 | 0.999820 |
| 75% | 7.731435e+08 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 | 0.000337 | 0.999900 |
| max | 8.283431e+08 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 | 0.999580 | 0.999990 |
# Preview the first five rows of the DataFrame.
df.head(n=5)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0.000093 | 0.99991 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0.000057 | 0.99994 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0.000021 | 0.99998 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | ... | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0.000134 | 0.99987 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0.000022 | 0.99998 |
5 rows × 23 columns
Numerical Features: Customer_Age, Dependent_count, Months_on_book, Credit_Limit, Total_Revolving_Bal, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio. Categorical Features: These represent categories or labels. Examples from the dataset: Attrition_Flag (binary category: “Attrited Customer” or “Existing Customer”); Gender (“M” or “F”); Education_Level (e.g., “High School”, “Graduate”, etc.); Marital_Status (e.g., “Single”, “Married”, etc.); Income_Category (e.g., “Less than $40K”, “$40K - $60K”, etc.); Card_Category (e.g., “Blue”, “Silver”, “Gold”, “Platinum”).
import matplotlib.pyplot as plt
# Scatter plot of total transaction amount (y) against customer age (x).
feature1 = 'Customer_Age'
feature2 = 'Total_Trans_Amt'

fig, ax = plt.subplots(figsize=(8, 6))
# Semi-transparent markers keep overlapping points distinguishable.
ax.scatter(df[feature1], df[feature2], alpha=0.5, color='b')
ax.set(
    xlabel=feature1,
    ylabel=feature2,
    title=f'Scatter Plot: {feature1} vs. {feature2}',
)
ax.grid(True)
plt.show()
import matplotlib.pyplot as plt
# Scatter plot of credit limit (y) against customer age (x).
feature1 = 'Customer_Age'
feature2 = 'Credit_Limit'

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=df[feature1], y=df[feature2], color='b', alpha=0.5)
ax.set_xlabel(feature1)
ax.set_ylabel(feature2)
ax.set_title(f'Scatter Plot: {feature1} vs. {feature2}')
ax.grid(True)
plt.show()
import seaborn as sns
# Columns to include in the pairwise scatter-plot matrix.
selected_features = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
# Draw every pairwise relationship between the selected columns.
sns.pairplot(data=df[selected_features])
/Users/apple/anaconda3/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.PairGrid at 0x14777a510>
# Histogram (20 bins) of customer age.
feature = 'Customer_Age'

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df[feature], bins=20, color='skyblue', edgecolor='black')
ax.set(xlabel=feature, ylabel='Frequency', title=f'Histogram: {feature}')
ax.grid(True)
plt.show()
import numpy as np
# Histogram (20 bins) of the raw total transaction amount.
feature = 'Total_Trans_Amt'

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df[feature], bins=20, edgecolor='black', color='skyblue')
ax.set_xlabel(feature)
ax.set_ylabel('Frequency')
ax.set_title(f'Histogram: {feature}')
ax.grid(True)
plt.show()
# Histogram (20 bins) of the natural log of the total transaction amount.
# The axis label and title state the log transform so the plot is not
# mistaken for the raw-scale histogram of the same feature.
feature = 'Total_Trans_Amt'
plt.figure(figsize=(8, 6))
# np.log is the natural logarithm; it needs strictly positive inputs
# (the describe() output for this column reports a minimum of 510).
plt.hist(np.log(df[feature]), bins=20, color='skyblue', edgecolor='black')
plt.xlabel(f'log({feature})')
plt.ylabel('Frequency')
plt.title(f'Histogram: log({feature})')
plt.grid(True)
plt.show()
import seaborn as sns
# Horizontal box plot of the total transaction amount.
feature = 'Total_Trans_Amt'

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df[feature], ax=ax)
ax.set(xlabel=feature, title=f'Box Plot: {feature}')
ax.grid(True)
plt.show()
import seaborn as sns
# Horizontal box plot of customer age.
feature = 'Customer_Age'

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df[feature], ax=ax)
ax.set_xlabel(feature)
ax.set_title(f'Box Plot: {feature}')
ax.grid(True)
plt.show()
# Horizontal box plot of the total transaction amount.
feature = 'Total_Trans_Amt'

fig, ax = plt.subplots(figsize=(8, 6))
column = df[feature]
sns.boxplot(x=column, ax=ax)
ax.set_xlabel(feature)
ax.set_title(f'Box Plot: {feature}')
ax.grid(True)
plt.show()
# Horizontal box plot of the Avg_Open_To_Buy column.
feature = 'Avg_Open_To_Buy'

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df[feature], ax=ax)
ax.set(xlabel=feature, title=f'Box Plot: {feature}')
ax.grid(True)
plt.show()
# Print min, max, median, mean and skewness of the total transaction amount.
feature = 'Total_Trans_Amt'
values = df[feature]  # select the column once instead of re-indexing per statistic

skewness = values.skew()
minimum = values.min()
maximum = values.max()
mean = values.mean()
median = values.median()

print(f'Min of {feature}: {minimum:.2f}')
print(f'Max of {feature}: {maximum:.2f}')
print(f'Median of {feature}: {median:.2f}')
print(f'Mean of {feature}: {mean:.2f}')
print(f'Skewness of {feature}: {skewness:.2f}')
Min of Total_Trans_Amt: 510.00 Max of Total_Trans_Amt: 18484.00 Median of Total_Trans_Amt: 3899.00 Mean of Total_Trans_Amt: 4404.09 Skewness of Total_Trans_Amt: 2.04
Task 2
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Reload the dataset, then compare the customer-age distribution across
# card categories with one box per category.
df = pd.read_csv(filepath_or_buffer='creditCustomers.csv')
cat_feature = 'Card_Category'
num_feature = 'Customer_Age'

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x=cat_feature, y=num_feature, ax=ax)
ax.set(
    xlabel=cat_feature,
    ylabel=num_feature,
    title=f'Boxplot: {num_feature} by {cat_feature}',
)
ax.grid(True)
plt.show()
# Report the mean, median and standard deviation of the total
# transaction amount.
num_feature = 'Total_Trans_Amt'
column = df[num_feature]

# Compute all three statistics first, then print them in the same order.
mean = column.mean()
median = column.median()
std = column.std()

print(f'Mean of {num_feature}: {mean:.2f}')
print(f'Median of {num_feature}: {median:.2f}')
print(f'Standard Deviation of {num_feature}: {std:.2f}')
Mean of Total_Trans_Amt: 4404.09 Median of Total_Trans_Amt: 3899.00 Standard Deviation of Total_Trans_Amt: 3397.13
# Regression plot of two numerical features: scatter points plus a
# fitted regression line, with transaction amount on x and age on y.
feature1 = 'Total_Trans_Amt'
feature2 = 'Customer_Age'
# TODO: draw this plot correctly

fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(data=df, x=feature1, y=feature2, ax=ax)
ax.set(
    xlabel=feature1,
    ylabel=feature2,
    title=f'Correlation Plot: {feature1} vs. {feature2}',
)
ax.grid(True)
plt.show()
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
# Reload the dataset and check the chosen feature for normality with a
# normal Q-Q (probability) plot, then report its sample skewness.
df = pd.read_csv(filepath_or_buffer='creditCustomers.csv')
num_feature = 'Total_Trans_Amt'
# TODO: switch to a different normality plot

column = df[num_feature]

# probplot draws onto the current pyplot figure because plot=plt is passed.
plt.figure(figsize=(8, 6))
stats.probplot(column, dist="norm", plot=plt)
plt.title(f'Normal Probability Plot: {num_feature}')
plt.grid(True)
plt.show()

skewness = column.skew()
print(f'Skewness of {num_feature}: {skewness:.2f}')
Skewness of Total_Trans_Amt: 2.04
import pandas as pd
import matplotlib.pyplot as plt
# Bin the total transaction amount into four labelled ranges, then plot a
# histogram and print the skewness of the amounts inside each range.

# Load the dataset
df = pd.read_csv('creditCustomers.csv')

# Numerical feature to categorize
num_feature = 'Total_Trans_Amt'

# Bin edges: three fixed thresholds plus the observed maximum as the top edge.
# NOTE(review): pd.cut requires increasing edges, so this assumes the
# maximum exceeds 12000 -- confirm if the dataset changes.
max_value = df[num_feature].max()
bins = [0, 6000, 9000, 12000, max_value]

# pd.cut assigns each row to a right-closed interval, e.g. (0, 6000] -> 'Low'.
df['Category_Feature'] = pd.cut(
    df[num_feature],
    bins,
    labels=['Low', 'Medium', 'High', 'Very High'],
)

# The grouping key is categorical. Pass `observed` explicitly so the result
# does not depend on pandas' changing default (newer versions emit a
# FutureWarning otherwise), and so bins with no rows are skipped.
grouped_data = df.groupby('Category_Feature', observed=True)

# Plot a histogram and report skewness for each category
for name, group in grouped_data:
    plt.figure(figsize=(8, 6))
    plt.hist(group[num_feature], bins=20, color='skyblue', edgecolor='black')
    plt.xlabel(f'{name} {num_feature}')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of {num_feature} for {name} category')
    plt.grid(True)
    plt.show()

    skewness = group[num_feature].skew()
    print(f'Skewness of {num_feature} for {name} category: {skewness:.2f}')
Skewness of Total_Trans_Amt for Low category: -0.15
Skewness of Total_Trans_Amt for Medium category: -0.28
Skewness of Total_Trans_Amt for High category: 1.82
Skewness of Total_Trans_Amt for Very High category: 0.24
from scipy.stats import shapiro
import pandas as pd
# Shapiro-Wilk normality test on one numerical feature.
num_feature = 'Total_Trans_Amt'
alpha = 0.05  # significance level
max_shapiro_n = 5000  # above this size SciPy warns the p-value may be inaccurate

values = df[num_feature]
if len(values) > max_shapiro_n:
    # Test a random subsample instead of the full column so the p-value
    # stays reliable; the fixed seed keeps the result reproducible.
    values = values.sample(n=max_shapiro_n, random_state=0)

# Perform the Shapiro-Wilk test (null hypothesis: the data are normal)
stat, p = shapiro(values)
print(f'Shapiro-Wilk Test Statistic: {stat:.4f}')
print(f'Shapiro-Wilk Test P-value: {p:.4f}')

# Interpret the result
if p > alpha:
    print('Fail to reject the null hypothesis. The feature is normally distributed.')
else:
    print('Reject the null hypothesis. The feature is not normally distributed.')
Shapiro-Wilk Test Statistic: 0.7465 Shapiro-Wilk Test P-value: 0.0000 Reject the null hypothesis. The feature is not normally distributed.
/Users/apple/anaconda3/lib/python3.11/site-packages/scipy/stats/_morestats.py:1882: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
from scipy.stats import ks_2samp
# Two-sample Kolmogorov-Smirnov test comparing the empirical distributions
# of two numerical features.
# NOTE(review): the features are compared on their raw scales. Per the
# describe() output, Total_Trans_Amt starts at 510 while Customer_Age tops
# out at 73, so the samples do not overlap -- confirm this is intended.
feature1 = 'Total_Trans_Amt'
feature2 = 'Customer_Age'
alpha = 0.05  # significance level

stat, p = ks_2samp(df[feature1], df[feature2])
print(f'Two-Sample Kolmogorov-Smirnov Test Statistic: {stat:.4f}')
print(f'Two-Sample Kolmogorov-Smirnov Test P-value: {p:.4f}')

# Interpret the result
print(
    'Fail to reject the null hypothesis. The two features have the same distribution.'
    if p > alpha
    else 'Reject the null hypothesis. The two features have different distributions.'
)
Two-Sample Kolmogorov-Smirnov Test Statistic: 1.0000 Two-Sample Kolmogorov-Smirnov Test P-value: 0.0000 Reject the null hypothesis. The two features have different distributions.